In [1]:
import pandas as pd
import numpy as np
from itertools import *

Step 0: Business quesitons¶

Goal: fill NaNs in dataset, so that it can be passed on to an ML model¶
  • I. fill NaNs
  • II. keep as many observations as possible
  • III. keep as many features as possible

Outline:¶

  1. load in data
  2. summary, count nans
  3. feature engineering
    • (a) datetime cols
    • (b) categorical cols
    • -> correlation matrix of each angle
  4. investigate nans
  5. options
    • selected mix of drop cols, ML regression model with xgboost, and drop rows
  6. solution part 1: drop cols
  7. solution part 2a: ML regression model with xgboost
  8. solution part 2b: drop rows for nan cols below accuracy threshold
  9. Summary
    • 91% of observations retained with accuracy of filling missing values of 80% or higher
    • final dataset is 131,000 observations
  10. export dataset

Step 1: load in data¶

In [2]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('mode.chained_assignment',None)
np.set_printoptions(threshold=np.inf)

def open_data(repo, file):
    with open("".join([repo, file]), "r", errors="ignore") as r:
        file = pd.read_csv(r, engine="python", parse_dates=["Date"])
        print(file.info())
    return file

repo = r"C:/Users/benno/OneDrive/Python/Dash/Deployment_ready/rainfall_aus/assets/data/"
file = "weatherAUS.csv"

rain = open_data(repo, file)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145460 entries, 0 to 145459
Data columns (total 23 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   Date           145460 non-null  datetime64[ns]
 1   Location       145460 non-null  object        
 2   MinTemp        143975 non-null  float64       
 3   MaxTemp        144199 non-null  float64       
 4   Rainfall       142199 non-null  float64       
 5   Evaporation    82670 non-null   float64       
 6   Sunshine       75625 non-null   float64       
 7   WindGustDir    135134 non-null  object        
 8   WindGustSpeed  135197 non-null  float64       
 9   WindDir9am     134894 non-null  object        
 10  WindDir3pm     141232 non-null  object        
 11  WindSpeed9am   143693 non-null  float64       
 12  WindSpeed3pm   142398 non-null  float64       
 13  Humidity9am    142806 non-null  float64       
 14  Humidity3pm    140953 non-null  float64       
 15  Pressure9am    130395 non-null  float64       
 16  Pressure3pm    130432 non-null  float64       
 17  Cloud9am       89572 non-null   float64       
 18  Cloud3pm       86102 non-null   float64       
 19  Temp9am        143693 non-null  float64       
 20  Temp3pm        141851 non-null  float64       
 21  RainToday      142199 non-null  object        
 22  RainTomorrow   142193 non-null  object        
dtypes: datetime64[ns](1), float64(16), object(6)
memory usage: 25.5+ MB
None

Step 2: summarize data¶

  • most cols have missing values
  • category of "highly sparse" columns @ 50-60% nonnulls
  • category of "90s" columns @ 90-99% nonnulls
In [3]:
import warnings
warnings.filterwarnings("ignore")

def summary_dataframe_small(df):

	return pd.DataFrame({'features': df.columns,'dtypes': [df.dtypes[i] for i in range(len(df.columns))],
	'nulls':[df[df.columns[i]].isnull().sum() for i in range(len(df.columns))],'nonnuls / 1.0': (df.shape[0] - 
	df.isnull().sum().values) / df.shape[0],'uniques': [df[j].nunique() for j in df.columns]})

summary_dataframe_small(rain)
Out[3]:
features dtypes nulls nonnuls / 1.0 uniques
0 Date datetime64[ns] 0 1.000000 3436
1 Location object 0 1.000000 49
2 MinTemp float64 1485 0.989791 389
3 MaxTemp float64 1261 0.991331 505
4 Rainfall float64 3261 0.977581 681
5 Evaporation float64 62790 0.568335 358
6 Sunshine float64 69835 0.519902 145
7 WindGustDir object 10326 0.929011 16
8 WindGustSpeed float64 10263 0.929445 67
9 WindDir9am object 10566 0.927361 16
10 WindDir3pm object 4228 0.970934 16
11 WindSpeed9am float64 1767 0.987852 43
12 WindSpeed3pm float64 3062 0.978950 44
13 Humidity9am float64 2654 0.981754 101
14 Humidity3pm float64 4507 0.969016 101
15 Pressure9am float64 15065 0.896432 546
16 Pressure3pm float64 15028 0.896686 549
17 Cloud9am float64 55888 0.615784 10
18 Cloud3pm float64 59358 0.591929 10
19 Temp9am float64 1767 0.987852 441
20 Temp3pm float64 3609 0.975189 502
21 RainToday object 3261 0.977581 2
22 RainTomorrow object 3267 0.977540 2
In [4]:
def summary_dataframe_large(df):
    
    summary = pd.DataFrame({
        "feature": df.columns,
        "dtype" : [str(df[col].dtype) for col in df.columns],
        "nulls": [f"{df[col].isnull().sum()}" for col in df.columns],
        "nonnulls%": [f"{round(((1 - df[col].isnull().sum() / len(df)) * 100), 2)}%" for col in df.columns],
        "STATS": ["|" for col in df.columns],
        "min": [df[col].min().round(2) if any(t in str(df[col].dtype) for t in ("float", "int")) else " " for col in df.columns],
        "50%": [df[col].median().round(2) if any(t in str(df[col].dtype) for t in ("float", "int")) else " "for col in df.columns],
        "max": [df[col].max().round(2) if any(t in str(df[col].dtype) for t in ("float", "int")) else " " for col in df.columns],
        "stdev": [df[col].std().round(2) if any(t in str(df[col].dtype) for t in ("float", "int")) else " " for col in df.columns],
        "mean": [df[col].mean().round(2) if any(t in str(df[col].dtype) for t in ("float", "int")) else " " for col in df.columns],
        "skew": ["" if not any(t in str(df[col].dtype) for t in ("float", "int")) else "left" if df[col].median() > df[col].mean() else "right" if df[col].median() < df[col].mean() else "equal" for col in df.columns],
        "cat_mode": [" " if any(t in str(df[col].dtype) for t in ("float", "int")) else df[col].mode()[0] for col in df.columns],        
        "cat_mode": [" " if any(t in str(df[col].dtype) for t in ("float", "int")) else df[col].mode()[0] for col in df.columns],
        "COUNTS": ["|" for col in df.columns],
        "nuniques": [df[col].nunique() for col in df.columns],
        "vcs-": [df[col].value_counts().reset_index()["count"].min() for col in df.columns],
        "vcs+": [df[col].value_counts().reset_index()["count"].max() for col in df.columns],
        "RANDOM": ["|" for col in df.columns],
        "row[random]" :[df[col].iloc[np.random.randint(len(df))].round(2) if any(t in str(df[col].dtype) for t in ("float", "int")) else df[col].iloc[np.random.randint(len(df))] for col in df.columns],
    })

    print(df.shape)

    return summary
    
summary = summary_dataframe_large(rain)
summary
(145460, 23)
Out[4]:
feature dtype nulls nonnulls% STATS min 50% max stdev mean skew cat_mode COUNTS nuniques vcs- vcs+ RANDOM row[random]
0 Date datetime64[ns] 0 100.0% | 2013-03-01 00:00:00 | 3436 1 49 | 2015-10-04 00:00:00
1 Location object 0 100.0% | Canberra | 49 1578 3436 | NorahHead
2 MinTemp float64 1485 98.98% | -8.5 12.0 33.9 6.4 12.19 right | 389 1 899 | 16.7
3 MaxTemp float64 1261 99.13% | -4.8 22.6 48.1 7.12 23.22 right | 505 1 885 | 21.3
4 Rainfall float64 3261 97.76% | 0.0 0.0 371.0 8.48 2.36 right | 681 1 91080 | 0.0
5 Evaporation float64 62790 56.83% | 0.0 4.8 145.0 4.19 5.47 right | 358 1 3339 | 7.0
6 Sunshine float64 69835 51.99% | 0.0 8.4 14.5 3.79 7.61 left | 145 1 2359 | 3.4
7 WindGustDir object 10326 92.9% | W | 16 6548 9915 | NE
8 WindGustSpeed float64 10263 92.94% | 6.0 39.0 135.0 13.61 40.04 right | 67 1 9215 | 50.0
9 WindDir9am object 10566 92.74% | N | 16 7024 11758 | SW
10 WindDir3pm object 4228 97.09% | SE | 16 6590 10838 | WNW
11 WindSpeed9am float64 1767 98.79% | 0.0 13.0 130.0 8.92 14.04 right | 43 1 13649 | 13.0
12 WindSpeed3pm float64 3062 97.89% | 0.0 19.0 87.0 8.81 18.66 left | 44 1 12580 | 17.0
13 Humidity9am float64 2654 98.18% | 0.0 70.0 100.0 19.03 68.88 left | 101 1 3391 | 79.0
14 Humidity3pm float64 4507 96.9% | 0.0 52.0 100.0 20.8 51.54 left | 101 4 2751 | 65.0
15 Pressure9am float64 15065 89.64% | 980.5 1017.6 1041.0 7.11 1017.65 right | 546 1 816 | 1018.8
16 Pressure3pm float64 15028 89.67% | 977.1 1015.2 1039.6 7.04 1015.26 right | 549 1 786 | NaN
17 Cloud9am float64 55888 61.58% | 0.0 5.0 9.0 2.89 4.45 left | 10 2 19972 | NaN
18 Cloud3pm float64 59358 59.19% | 0.0 5.0 9.0 2.72 4.51 left | 10 1 18229 | NaN
19 Temp9am float64 1767 98.79% | -7.2 16.7 40.2 6.49 16.99 right | 441 1 912 | 13.3
20 Temp3pm float64 3609 97.52% | -5.4 21.1 46.7 6.94 21.68 right | 502 1 882 | 23.2
21 RainToday object 3261 97.76% | No | 2 31880 110319 | No
22 RainTomorrow object 3267 97.75% | No | 2 31877 110316 | Yes
In [5]:
rain.shape
Out[5]:
(145460, 23)

Step 3a: Feature Engineering, datetime¶

  • datetime cols
In [6]:
rain_ml = rain.copy()
In [7]:
from datetime import datetime as dt

for df in [rain_ml]:
    df['Datetime'] = df['Date'].apply(lambda x: dt.strptime(str(x), "%Y-%m-%d %H:%M:%S"))
    df['Weekday'] = [t.day_name() for t in df.Datetime]
    df['Month'] = [t.month_name() for t in df.Datetime]
    df['Day'] = [t.day for t in df.Datetime]
    df['Hour'] = [t.hour for t in df.Datetime]
    df['Minute'] = [t.minute for t in df.Datetime]
    df['Date'] = [t.date() for t in df.Datetime]
In [8]:
# drop "datetime" dtype cols as some ml models cannot handle this datatype
rain_ml = rain_ml.drop(["Datetime", "Date"], axis=1)
Correlation, angle (A)¶
  • after datetime col creation
  • before onehot encoding
In [9]:
import plotly.io as pio
import plotly.express as px
import plotly.io as pio
import plotly.graph_objs as go
from plotly.subplots import make_subplots

def heatmap_allfeatures_pearson(df, height=800, colors="Oryel"):

    df = df.select_dtypes(["number"])

    corr = df.corr().round(2)
    mask = np.zeros_like(corr, dtype=bool)
    mask[np.triu_indices_from(mask)] = True
    corr = corr.mask(mask)

    g = px.imshow(corr, text_auto=True, color_continuous_scale=colors, zmin=-1, zmax=1)

    g.update_layout(
    { "template": "plotly_white",
     "title": "heatmap, pearson correlation",
    "xaxis":{"showgrid": False,
             "zeroline": False},
    "yaxis": {"showgrid": False,
             "zeroline": False},
     "height": height,
   }).update_traces(xgap=2,ygap=2)

    return g, corr

fig, corr = heatmap_allfeatures_pearson(rain_ml, height=700, colors="RdYlBu_r")
fig

Step 3b: Feature Engineering of categoricals¶

  • onehot encoding
In [10]:
def pull_categoricals(df):
    return [j for j in df.columns if "object" in str(df[j].dtype)]

categoricals = pull_categoricals(rain_ml)
In [11]:
categoricals
Out[11]:
['Location',
 'WindGustDir',
 'WindDir9am',
 'WindDir3pm',
 'RainToday',
 'RainTomorrow',
 'Weekday',
 'Month']
In [12]:
def pull_continuous(df, categoricals):
    return [j for j in df.columns if j not in categoricals]

continuous = pull_continuous(rain_ml, categoricals)
In [13]:
from sklearn.preprocessing import OneHotEncoder

def onehot_final(df, categoricals):	

	return pd.DataFrame(OneHotEncoder().fit_transform(df[categoricals].astype(str)).toarray(),
    columns = [i for j in [[str(j) + '_' + str(i) for i in np.unique(df[j].astype(str).values)] for j in categoricals] for i in j]
    ).reset_index().rename(columns={"index": "id"}).merge(df.drop(categoricals, axis=1).reset_index().rename(columns={"index": "id"})).drop(["id"], axis=1)

rain_ml = onehot_final(rain_ml, categoricals)
Correlation, angle (B)¶
  • onehot encoded columns
  • zoom in to see data vis
In [53]:
fig, corr = heatmap_allfeatures_pearson(rain_ml, height=700, colors="RdYlBu_r")
fig

Step 4: investigate NaNs¶

In [15]:
summary_dataframe_small(rain_ml)
Out[15]:
features dtypes nulls nonnuls / 1.0 uniques
0 Location_Adelaide float64 0 1.000000 2
1 Location_Albany float64 0 1.000000 2
2 Location_Albury float64 0 1.000000 2
3 Location_AliceSprings float64 0 1.000000 2
4 Location_BadgerysCreek float64 0 1.000000 2
5 Location_Ballarat float64 0 1.000000 2
6 Location_Bendigo float64 0 1.000000 2
7 Location_Brisbane float64 0 1.000000 2
8 Location_Cairns float64 0 1.000000 2
9 Location_Canberra float64 0 1.000000 2
10 Location_Cobar float64 0 1.000000 2
11 Location_CoffsHarbour float64 0 1.000000 2
12 Location_Dartmoor float64 0 1.000000 2
13 Location_Darwin float64 0 1.000000 2
14 Location_GoldCoast float64 0 1.000000 2
15 Location_Hobart float64 0 1.000000 2
16 Location_Katherine float64 0 1.000000 2
17 Location_Launceston float64 0 1.000000 2
18 Location_Melbourne float64 0 1.000000 2
19 Location_MelbourneAirport float64 0 1.000000 2
20 Location_Mildura float64 0 1.000000 2
21 Location_Moree float64 0 1.000000 2
22 Location_MountGambier float64 0 1.000000 2
23 Location_MountGinini float64 0 1.000000 2
24 Location_Newcastle float64 0 1.000000 2
25 Location_Nhil float64 0 1.000000 2
26 Location_NorahHead float64 0 1.000000 2
27 Location_NorfolkIsland float64 0 1.000000 2
28 Location_Nuriootpa float64 0 1.000000 2
29 Location_PearceRAAF float64 0 1.000000 2
30 Location_Penrith float64 0 1.000000 2
31 Location_Perth float64 0 1.000000 2
32 Location_PerthAirport float64 0 1.000000 2
33 Location_Portland float64 0 1.000000 2
34 Location_Richmond float64 0 1.000000 2
35 Location_Sale float64 0 1.000000 2
36 Location_SalmonGums float64 0 1.000000 2
37 Location_Sydney float64 0 1.000000 2
38 Location_SydneyAirport float64 0 1.000000 2
39 Location_Townsville float64 0 1.000000 2
40 Location_Tuggeranong float64 0 1.000000 2
41 Location_Uluru float64 0 1.000000 2
42 Location_WaggaWagga float64 0 1.000000 2
43 Location_Walpole float64 0 1.000000 2
44 Location_Watsonia float64 0 1.000000 2
45 Location_Williamtown float64 0 1.000000 2
46 Location_Witchcliffe float64 0 1.000000 2
47 Location_Wollongong float64 0 1.000000 2
48 Location_Woomera float64 0 1.000000 2
49 WindGustDir_E float64 0 1.000000 2
50 WindGustDir_ENE float64 0 1.000000 2
51 WindGustDir_ESE float64 0 1.000000 2
52 WindGustDir_N float64 0 1.000000 2
53 WindGustDir_NE float64 0 1.000000 2
54 WindGustDir_NNE float64 0 1.000000 2
55 WindGustDir_NNW float64 0 1.000000 2
56 WindGustDir_NW float64 0 1.000000 2
57 WindGustDir_S float64 0 1.000000 2
58 WindGustDir_SE float64 0 1.000000 2
59 WindGustDir_SSE float64 0 1.000000 2
60 WindGustDir_SSW float64 0 1.000000 2
61 WindGustDir_SW float64 0 1.000000 2
62 WindGustDir_W float64 0 1.000000 2
63 WindGustDir_WNW float64 0 1.000000 2
64 WindGustDir_WSW float64 0 1.000000 2
65 WindGustDir_nan float64 0 1.000000 2
66 WindDir9am_E float64 0 1.000000 2
67 WindDir9am_ENE float64 0 1.000000 2
68 WindDir9am_ESE float64 0 1.000000 2
69 WindDir9am_N float64 0 1.000000 2
70 WindDir9am_NE float64 0 1.000000 2
71 WindDir9am_NNE float64 0 1.000000 2
72 WindDir9am_NNW float64 0 1.000000 2
73 WindDir9am_NW float64 0 1.000000 2
74 WindDir9am_S float64 0 1.000000 2
75 WindDir9am_SE float64 0 1.000000 2
76 WindDir9am_SSE float64 0 1.000000 2
77 WindDir9am_SSW float64 0 1.000000 2
78 WindDir9am_SW float64 0 1.000000 2
79 WindDir9am_W float64 0 1.000000 2
80 WindDir9am_WNW float64 0 1.000000 2
81 WindDir9am_WSW float64 0 1.000000 2
82 WindDir9am_nan float64 0 1.000000 2
83 WindDir3pm_E float64 0 1.000000 2
84 WindDir3pm_ENE float64 0 1.000000 2
85 WindDir3pm_ESE float64 0 1.000000 2
86 WindDir3pm_N float64 0 1.000000 2
87 WindDir3pm_NE float64 0 1.000000 2
88 WindDir3pm_NNE float64 0 1.000000 2
89 WindDir3pm_NNW float64 0 1.000000 2
90 WindDir3pm_NW float64 0 1.000000 2
91 WindDir3pm_S float64 0 1.000000 2
92 WindDir3pm_SE float64 0 1.000000 2
93 WindDir3pm_SSE float64 0 1.000000 2
94 WindDir3pm_SSW float64 0 1.000000 2
95 WindDir3pm_SW float64 0 1.000000 2
96 WindDir3pm_W float64 0 1.000000 2
97 WindDir3pm_WNW float64 0 1.000000 2
98 WindDir3pm_WSW float64 0 1.000000 2
99 WindDir3pm_nan float64 0 1.000000 2
100 RainToday_No float64 0 1.000000 2
101 RainToday_Yes float64 0 1.000000 2
102 RainToday_nan float64 0 1.000000 2
103 RainTomorrow_No float64 0 1.000000 2
104 RainTomorrow_Yes float64 0 1.000000 2
105 RainTomorrow_nan float64 0 1.000000 2
106 Weekday_Friday float64 0 1.000000 2
107 Weekday_Monday float64 0 1.000000 2
108 Weekday_Saturday float64 0 1.000000 2
109 Weekday_Sunday float64 0 1.000000 2
110 Weekday_Thursday float64 0 1.000000 2
111 Weekday_Tuesday float64 0 1.000000 2
112 Weekday_Wednesday float64 0 1.000000 2
113 Month_April float64 0 1.000000 2
114 Month_August float64 0 1.000000 2
115 Month_December float64 0 1.000000 2
116 Month_February float64 0 1.000000 2
117 Month_January float64 0 1.000000 2
118 Month_July float64 0 1.000000 2
119 Month_June float64 0 1.000000 2
120 Month_March float64 0 1.000000 2
121 Month_May float64 0 1.000000 2
122 Month_November float64 0 1.000000 2
123 Month_October float64 0 1.000000 2
124 Month_September float64 0 1.000000 2
125 MinTemp float64 1485 0.989791 389
126 MaxTemp float64 1261 0.991331 505
127 Rainfall float64 3261 0.977581 681
128 Evaporation float64 62790 0.568335 358
129 Sunshine float64 69835 0.519902 145
130 WindGustSpeed float64 10263 0.929445 67
131 WindSpeed9am float64 1767 0.987852 43
132 WindSpeed3pm float64 3062 0.978950 44
133 Humidity9am float64 2654 0.981754 101
134 Humidity3pm float64 4507 0.969016 101
135 Pressure9am float64 15065 0.896432 546
136 Pressure3pm float64 15028 0.896686 549
137 Cloud9am float64 55888 0.615784 10
138 Cloud3pm float64 59358 0.591929 10
139 Temp9am float64 1767 0.987852 441
140 Temp3pm float64 3609 0.975189 502
141 Day int64 0 1.000000 31
142 Hour int64 0 1.000000 1
143 Minute int64 0 1.000000 1
In [16]:
rain_ml.columns[-19:]
Out[16]:
Index(['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine',
       'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am',
       'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm',
       'Temp9am', 'Temp3pm', 'Day', 'Hour', 'Minute'],
      dtype='object')
In [17]:
nancols = ['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine',
       'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am',
       'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm',
       'Temp9am', 'Temp3pm',]
In [18]:
nan1_mk = ( rain_ml["MinTemp"].isnull() )
s0_mk = ( rain_ml["Sunshine"] == 0 )
c0_mk = (rain_ml["Cloud9am"] == 0 )

# rain_ml[nan_mk][:50]
# rain_ml[s0_mk].iloc[:50, -19:]
# rain_ml[c0_mk].iloc[:50, -19:]

Step 4: Findings¶

data appears to be MCAR, with the missing data not forming a pattern

Step 5: Options to deal with missing values¶

Problem 1: 50-60% missing value columns¶
  • option 0: gather new data
  • option 1: drop rows (keep about 35% of the data)
  • option 2: drop columns (keep all observation-wise data, don't consider cols with nulls)
Problem 2: 90-99% missing value columns¶
  • option 0: gather new data
  • option 1: drop rows (keep about 80% of the data)
  • option 2: drop columns (this would be the majority of the data deleted)
  • option 3: interpolation with ML model (guess the values using the data at hand; 83-98% accuracy)
Solutions¶
  • problem 1:
    • option 2
  • problem 2:
    • option 3, then option 1

Step 6: Solution 1), Drop Cols¶

drop class of feature with 50-60% nonnulls

In [19]:
dropcols = ["Evaporation", "Sunshine", "Cloud9am", "Cloud3pm"]
rain_ml = rain_ml.drop(dropcols, axis=1)

Step 7: Solution 2.i) ML Regression Model¶

  • -> 1x: test performance among all models
  • -> 2x: select model, remove other models, predict, and interpolate
In [54]:
# 1x:

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb

model_lst = [
    LinearRegression(n_jobs=-1),
    DecisionTreeRegressor(),
    RandomForestRegressor(n_jobs=-1),
    GradientBoostingRegressor(),
    xgb.XGBRegressor(n_jobs=-1),
]
In [21]:
# 2x:

model_lst = [
    xgb.XGBRegressor(n_jobs=-1),
]
In [22]:
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

def model_selection_kfold_regression(models, df, dep_var): 

    df = df.dropna().reset_index(drop=True)

    data = df.drop([dep_var],axis=1).squeeze()
    target = df[[dep_var]].squeeze()

    idx = np.random.choice(range(0,len(df)), size=round(len(df)*0.80), replace=False)

    x_train = data.iloc[idx].reset_index(drop=True)
    y_train = target.iloc[idx].reset_index(drop=True)

    x_test = data.iloc[data.index.difference(idx)].reset_index(drop=True)
    y_test = target.iloc[data.index.difference(idx)].reset_index(drop=True)    
    
    scores_dct = {}    
    
    for model in models:
        
        clf = model

        clf.fit(x_train, y_train)
                
        preds = clf.predict(x_test)

        cv_score = np.mean(cross_val_score(clf, data, target, cv=KFold(n_splits=5, shuffle=True, random_state=42)))
        
        scores_dct[str(model)] = {}
        scores_dct[str(model)]["model"] = model        
        scores_dct[str(model)]["preds"] = preds        
        scores_dct[str(model)]["scores"] = {
            'mean_squared_error':metrics.mean_squared_error(preds, y_test),
             'train':clf.score(x_train, y_train),
             'test':clf.score(x_test, y_test),
             'cv':cv_score
        }
        
    
    return scores_dct
In [23]:
def scores_dct_to_df(dct):

    df = pd.DataFrame(
        index=["XGBoost" if "base_score" in str(j) else str(j) for j in dct.keys()],
        columns=list(np.array(list(dct.values()))[0]["scores"].keys()),
        data=[list(j["scores"].values()) for j in list(dct.values())]
    )

    return df
In [24]:
def interpolate_via_model(df, column, model):

    interpolate = pd.DataFrame({
        "true": df[column],
        "preds": model.predict(df.drop([column], axis=1)),
    })

    nan_mk = ( interpolate["true"].isnull() )
    interpolate["true"][nan_mk] = interpolate["preds"][nan_mk]

    return interpolate["true"]
Step 7a) feature "MinTemp"¶
  • 1x: sampling of models
In [55]:
model_dct_mintemp = model_selection_kfold_regression(model_lst, rain_ml, "MinTemp")
scores_df = scores_dct_to_df(model_dct_mintemp)
scores_df.sort_values(by=["mean_squared_error"],ascending=True)
Out[55]:
mean_squared_error train test cv
XGBoost 2.939828 0.943602 0.929640 0.930224
RandomForestRegressor(n_jobs=-1) 3.406263 0.988639 0.918477 0.919240
LinearRegression(n_jobs=-1) 3.645535 0.914853 0.912750 0.914249
GradientBoostingRegressor() 3.941957 0.908978 0.905656 0.907212
DecisionTreeRegressor() 7.203550 1.000000 0.827595 0.830032
In [26]:
model_mintemp = model_dct_mintemp[list(model_dct_mintemp.keys())[0]]["model"]
rain_ml["MinTemp"] = interpolate_via_model(rain_ml, "MinTemp", model_mintemp)
Step 7b) feature "MaxTemp"¶
  • 2x: after viewing above format for all features, select model that perfoms the best
In [27]:
scores_dct_maxtemp = model_selection_kfold_regression(model_lst, rain_ml, "MaxTemp")
scores_df = scores_dct_to_df(scores_dct_maxtemp)
scores_df.sort_values(by=["mean_squared_error"],ascending=True)
Out[27]:
mean_squared_error train test cv
XGBoost 0.715151 0.989359 0.985244 0.985408
In [28]:
model_maxtemp = scores_dct_maxtemp[list(model_dct_mintemp.keys())[0]]["model"]
rain_ml["MaxTemp"] = interpolate_via_model(rain_ml, "MaxTemp", model_maxtemp)
Step 7c) feature "Rainfall"¶
  • model performed below threshold of 80%; do not use model to interpoate
In [29]:
scores_dct_rainfall = model_selection_kfold_regression(model_lst, rain_ml, "Rainfall")
scores_df = scores_dct_to_df(scores_dct_rainfall)
scores_df.sort_values(by=["mean_squared_error"],ascending=True)
Out[29]:
mean_squared_error train test cv
XGBoost 35.722158 0.769334 0.420896 0.422957
In [30]:
# model_rainfall = scores_dct_rainfall["XGBRegress"]["model"]
# rain_ml["Rainfall"] = interpolate_via_model(rain_ml, "Rainfall", model_rainfall)
Step 7d) feature "WindSpeed9am"¶
  • model performed below threshold of 80%; do not use model to interpoate
In [31]:
# 1:
scores_dct_ws9am = model_selection_kfold_regression(model_lst, rain_ml, "WindSpeed9am")
scores_df = scores_dct_to_df(scores_dct_ws9am)
scores_df.sort_values(by=["mean_squared_error"],ascending=True)
Out[31]:
mean_squared_error train test cv
XGBoost 24.398231 0.757035 0.689277 0.690469
In [32]:
# model_ws9am = scores_dct_ws9am["XGBRegress"]["model"]
# rain_ml["WindSpeed9am"] = interpolate_via_model(rain_ml, "WindSpeed9am", model_ws9am)
Step 7e) feature "WindSpeed3pm"¶
  • model performed below threshold of 80%; do not use model to interpoate
In [33]:
scores_dct_ws3pm = model_selection_kfold_regression(model_lst, rain_ml, "WindSpeed3pm")
scores_df = scores_dct_to_df(scores_dct_ws3pm)
scores_df.sort_values(by=["mean_squared_error"], ascending=True)
Out[33]:
mean_squared_error train test cv
XGBoost 22.767067 0.756524 0.698011 0.691884
In [34]:
# model_ws3pm = scores_dct_ws3pm["XGBRegress"]["model"]
# rain_ml["WindSpeed3pm"] = interpolate_via_model(rain_ml, "WindSpeed3pm", model_ws3pm)
Step 7f) feature "Humidity9am"¶
In [35]:
scores_dct_h9am = model_selection_kfold_regression(model_lst, rain_ml, "Humidity9am")
scores_df = scores_dct_to_df(scores_dct_h9am)
scores_df.sort_values(by=["mean_squared_error"],ascending=True)
Out[35]:
mean_squared_error train test cv
XGBoost 58.993709 0.871371 0.838998 0.838083
In [36]:
model_h9am = scores_dct_h9am[list(model_dct_mintemp.keys())[0]]["model"]
rain_ml["Humidity9am"] = interpolate_via_model(rain_ml, "Humidity9am", model_h9am)
Step 7g) feature "Humidity3pm"¶
In [37]:
scores_dct_h3pm = model_selection_kfold_regression(model_lst, rain_ml, "Humidity3pm")
scores_df = scores_dct_to_df(scores_dct_h3pm)
scores_df.sort_values(by=["mean_squared_error"],ascending=True)
Out[37]:
mean_squared_error train test cv
XGBoost 51.932807 0.906245 0.879736 0.878621
In [38]:
model_h3pm = scores_dct_h3pm[list(model_dct_mintemp.keys())[0]]["model"]
rain_ml["Humidity3pm"] = interpolate_via_model(rain_ml, "Humidity3pm", model_h3pm)
Step 7h) feature "Pressure9am"¶
In [39]:
scores_dct_p9am = model_selection_kfold_regression(model_lst, rain_ml, "Pressure9am")
scores_df = scores_dct_to_df(scores_dct_p9am)
scores_df.sort_values(by=["mean_squared_error"],ascending=True)
Out[39]:
mean_squared_error train test cv
XGBoost 1.489629 0.977829 0.970852 0.970344
In [40]:
model_p9am = scores_dct_p9am[list(model_dct_mintemp.keys())[0]]["model"]
rain_ml["Pressure9am"] = interpolate_via_model(rain_ml, "Pressure9am", model_p9am)
Step 7i) feature "Pressure3pm"¶
In [41]:
# 1
scores_dct_p3pm = model_selection_kfold_regression(model_lst, rain_ml, "Pressure3pm")
scores_df = scores_dct_to_df(scores_dct_p3pm)
scores_df.sort_values(by=["mean_squared_error"],ascending=True)
Out[41]:
mean_squared_error train test cv
XGBoost 1.490953 0.977034 0.96956 0.969553
In [42]:
model_p3pm = scores_dct_p3pm[list(model_dct_mintemp.keys())[0]]["model"]
rain_ml["Pressure3pm"] = interpolate_via_model(rain_ml, "Pressure3pm", model_p3pm)
Step 7j) feature "Temp9am"¶
In [43]:
scores_dct_t9am = model_selection_kfold_regression(model_lst, rain_ml, "Temp9am")
scores_df = scores_dct_to_df(scores_dct_t9am)
scores_df.sort_values(by=["mean_squared_error"],ascending=True)
Out[43]:
mean_squared_error train test cv
XGBoost 1.167374 0.978179 0.972661 0.972582
In [44]:
model_t9am = scores_dct_t9am[list(model_dct_mintemp.keys())[0]]["model"]
rain_ml["Temp9am"] = interpolate_via_model(rain_ml, "Temp9am", model_t9am)
Step 7j) feature "Temp3pm"¶
In [45]:
scores_dct_t3pm = model_selection_kfold_regression(model_lst, rain_ml, "Temp3pm")
scores_df = scores_dct_to_df(scores_dct_t3pm)
scores_df.sort_values(by=["mean_squared_error"],ascending=True)
Out[45]:
mean_squared_error train test cv
XGBoost 0.646544 0.990824 0.98678 0.987138
In [46]:
model_t3pm = scores_dct_t3pm[list(model_dct_mintemp.keys())[0]]["model"]
rain_ml["Temp3pm"] = interpolate_via_model(rain_ml, "Temp3pm", model_t3pm)

Step 7: Summary¶

  • 3 features did not perform well enough to use interpolation methods
  • 8 features did perform well enough, with scores test && cv >= 80%

Step 8: Solution 2.ii) Drop Rows¶

where 3 above features are nan

In [47]:
rain_ml1 = rain_ml.copy()
In [48]:
rain_ml1 = rain_ml1.dropna()
In [49]:
rain_ml1.shape
Out[49]:
(132181, 140)

Step 9: Summarize¶

  • Final removed columns numbers 4
  • Final row-wise retention is ~132,000 / ~145,000, or 90.9% retention of observations
  • Final shape is 132,181 x 140 with 0 null values
In [50]:
rain_ml1.shape[0] / len(rain_ml)
Out[50]:
0.9087102983638113

Step 10: Export data¶

locally

In [51]:
repo = r"C:/Users/benno/OneDrive/Python/Dash/Deployment_ready/rainfall_aus/assets/data/"
file = "weatherAUS_cleaned.csv"

rain_ml1.to_csv("".join([repo, file]), index=False)
In [52]:
summary_dataframe_large(rain_ml1)
(132181, 140)
Out[52]:
feature dtype nulls nonnulls% STATS min 50% max stdev mean skew cat_mode COUNTS nuniques vcs- vcs+ RANDOM row[random]
0 Location_Adelaide float64 0 100.0% | 0.00 0.0 1.00 0.15 0.02 right | 2 3064 129117 | 0.0
1 Location_Albany float64 0 100.0% | 0.00 0.0 0.00 0.00 0.00 equal | 1 132181 132181 | 0.0
2 Location_Albury float64 0 100.0% | 0.00 0.0 1.00 0.15 0.02 right | 2 2984 129197 | 0.0
3 Location_AliceSprings float64 0 100.0% | 0.00 0.0 1.00 0.15 0.02 right | 2 2978 129203 | 0.0
4 Location_BadgerysCreek float64 0 100.0% | 0.00 0.0 1.00 0.15 0.02 right | 2 2864 129317 | 0.0
5 Location_Ballarat float64 0 100.0% | 0.00 0.0 1.00 0.15 0.02 right | 2 2992 129189 | 0.0
6 Location_Bendigo float64 0 100.0% | 0.00 0.0 1.00 0.15 0.02 right | 2 2993 129188 | 0.0
7 Location_Brisbane float64 0 100.0% | 0.00 0.0 1.00 0.15 0.02 right | 2 3122 129059 | 0.0
8 Location_Cairns float64 0 100.0% | 0.00 0.0 1.00 0.15 0.02 right | 2 2969 129212 | 0.0
9 Location_Canberra float64 0 100.0% | 0.00 0.0 1.00 0.15 0.02 right | 2 3073 129108 | 0.0
10 Location_Cobar float64 0 100.0% | 0.00 0.0 1.00 0.15 0.02 right | 2 2944 129237 | 0.0
11 Location_CoffsHarbour float64 0 100.0% | 0.00 0.0 1.00 0.14 0.02 right | 2 2632 129549 | 0.0
12 Location_Dartmoor float64 0 100.0% | 0.00 0.0 1.00 0.15 0.02 right | 2 2934 129247 | 0.0
13 Location_Darwin float64 0 100.0% | 0.00 0.0 1.00 0.15 0.02 right | 2 3148 129033 | 0.0
14 Location_GoldCoast float64 0 100.0% | 0.00 0.0 1.00 0.15 0.02 right | 2 2885 129296 | 0.0
15 Location_Hobart float64 0 100.0% | 0.00 0.0 1.00 0.15 0.02 right | 2 3164 129017 | 0.0
16 Location_Katherine float64 0 100.0% | 0.00 0.0 1.00 0.11 0.01 right | 2 1529 130652 | 0.0
17 Location_Launceston float64 0 100.0% | 0.00 0.0 1.00 0.15 0.02 right | 2 2991 129190 | 0.0
18 Location_Melbourne float64 0 100.0% | 0.00 0.0 1.00 0.13 0.02 right | 2 2419 129762 | 0.0
19 Location_MelbourneAirport float64 0 100.0% | 0.00 0.0 1.00 0.15 0.02 right | 2 2993 129188 | 0.0
20 Location_Mildura float64 0 100.0% | 0.00 0.0 1.00 0.15 0.02 right | 2 2999 129182 | 0.0
21 Location_Moree float64 0 100.0% | 0.00 0.0 1.00 0.14 0.02 right | 2 2776 129405 | 0.0
22 Location_MountGambier float64 0 100.0% | 0.00 0.0 1.00 0.15 0.02 right | 2 2979 129202 | 1.0
23 Location_MountGinini float64 0 100.0% | 0.00 0.0 1.00 0.14 0.02 right | 2 2563 129618 | 0.0
24 Location_Newcastle float64 0 100.0% | 0.00 0.0 0.00 0.00 0.00 equal | 1 132181 132181 | 0.0
25 Location_Nhil float64 0 100.0% | 0.00 0.0 1.00 0.11 0.01 right | 2 1563 130618 | 0.0
26 Location_NorahHead float64 0 100.0% | 0.00 0.0 1.00 0.15 0.02 right | 2 2883 129298 | 0.0
27 Location_NorfolkIsland float64 0 100.0% | 0.00 0.0 1.00 0.15 0.02 right | 2 2913 129268 | 0.0
28 Location_Nuriootpa float64 0 100.0% | 0.00 0.0 1.00 0.15 0.02 right | 2 2941 129240 | 0.0
29 Location_PearceRAAF float64 0 100.0% | 0.00 0.0 1.00 0.14 0.02 right | 2 2628 129553 | 0.0
30 Location_Penrith float64 0 100.0% | 0.00 0.0 1.00 0.15 0.02 right | 2 2910 129271 | 0.0
31 Location_Perth float64 0 100.0% | 0.00 0.0 1.00 0.15 0.02 right | 2 3188 128993 | 0.0
32 Location_PerthAirport float64 0 100.0% | 0.00 0.0 1.00 0.15 0.02 right | 2 2961 129220 | 0.0
33 Location_Portland float64 0 100.0% | 0.00 0.0 1.00 0.15 0.02 right | 2 2960 129221 | 0.0
34 Location_Richmond float64 0 100.0% | 0.00 0.0 1.00 0.15 0.02 right | 2 2886 129295 | 0.0
35 Location_Sale float64 0 100.0% | 0.00 0.0 1.00 0.15 0.02 right | 2 2858 129323 | 0.0
36 Location_SalmonGums float64 0 100.0% | 0.00 0.0 1.00 0.15 0.02 right | 2 2903 129278 | 0.0
37 Location_Sydney float64 0 100.0% | 0.00 0.0 1.00 0.13 0.02 right | 2 2298 129883 | 0.0
38 Location_SydneyAirport float64 0 100.0% | 0.00 0.0 1.00 0.15 0.02 right | 2 2953 129228 | 0.0
39 Location_Townsville float64 0 100.0% | 0.00 0.0 1.00 0.15 0.02 right | 2 3008 129173 | 0.0
40 Location_Tuggeranong float64 0 100.0% | 0.00 0.0 1.00 0.15 0.02 right | 2 2943 129238 | 0.0
41 Location_Uluru float64 0 100.0% | 0.00 0.0 1.00 0.11 0.01 right | 2 1483 130698 | 0.0
42 Location_WaggaWagga float64 0 100.0% | 0.00 0.0 1.00 0.15 0.02 right | 2 2948 129233 | 0.0
43 Location_Walpole float64 0 100.0% | 0.00 0.0 1.00 0.14 0.02 right | 2 2769 129412 | 0.0
44 Location_Watsonia float64 0 100.0% | 0.00 0.0 1.00 0.15 0.02 right | 2 2964 129217 | 0.0
45 Location_Williamtown float64 0 100.0% | 0.00 0.0 1.00 0.13 0.02 right | 2 2432 129749 | 0.0
46 Location_Witchcliffe float64 0 100.0% | 0.00 0.0 1.00 0.15 0.02 right | 2 2919 129262 | 0.0
47 Location_Wollongong float64 0 100.0% | 0.00 0.0 1.00 0.15 0.02 right | 2 2928 129253 | 0.0
48 Location_Woomera float64 0 100.0% | 0.00 0.0 1.00 0.15 0.02 right | 2 2947 129234 | 0.0
49 WindGustDir_E float64 0 100.0% | 0.00 0.0 1.00 0.25 0.07 right | 2 9022 123159 | 0.0
50 WindGustDir_ENE float64 0 100.0% | 0.00 0.0 1.00 0.24 0.06 right | 2 7956 124225 | 1.0
51 WindGustDir_ESE float64 0 100.0% | 0.00 0.0 1.00 0.23 0.05 right | 2 7262 124919 | 0.0
52 WindGustDir_N float64 0 100.0% | 0.00 0.0 1.00 0.25 0.07 right | 2 9009 123172 | 0.0
53 WindGustDir_NE float64 0 100.0% | 0.00 0.0 1.00 0.22 0.05 right | 2 7017 125164 | 0.0
54 WindGustDir_NNE float64 0 100.0% | 0.00 0.0 1.00 0.21 0.05 right | 2 6407 125774 | 0.0
55 WindGustDir_NNW float64 0 100.0% | 0.00 0.0 1.00 0.22 0.05 right | 2 6513 125668 | 0.0
56 WindGustDir_NW float64 0 100.0% | 0.00 0.0 1.00 0.24 0.06 right | 2 7971 124210 | 0.0
57 WindGustDir_S float64 0 100.0% | 0.00 0.0 1.00 0.25 0.07 right | 2 8889 123292 | 0.0
58 WindGustDir_SE float64 0 100.0% | 0.00 0.0 1.00 0.26 0.07 right | 2 9268 122913 | 0.0
59 WindGustDir_SSE float64 0 100.0% | 0.00 0.0 1.00 0.25 0.07 right | 2 8947 123234 | 1.0
60 WindGustDir_SSW float64 0 100.0% | 0.00 0.0 1.00 0.25 0.06 right | 2 8557 123624 | 0.0
61 WindGustDir_SW float64 0 100.0% | 0.00 0.0 1.00 0.25 0.07 right | 2 8754 123427 | 0.0
62 WindGustDir_W float64 0 100.0% | 0.00 0.0 1.00 0.26 0.07 right | 2 9705 122476 | 0.0
63 WindGustDir_WNW float64 0 100.0% | 0.00 0.0 1.00 0.24 0.06 right | 2 8020 124161 | 0.0
64 WindGustDir_WSW float64 0 100.0% | 0.00 0.0 1.00 0.25 0.07 right | 2 8845 123336 | 0.0
65 WindGustDir_nan float64 0 100.0% | 0.00 0.0 1.00 0.02 0.00 right | 2 39 132142 | 0.0
66 WindDir9am_E float64 0 100.0% | 0.00 0.0 1.00 0.25 0.07 right | 2 8660 123521 | 0.0
67 WindDir9am_ENE float64 0 100.0% | 0.00 0.0 1.00 0.23 0.06 right | 2 7480 124701 | 0.0
68 WindDir9am_ESE float64 0 100.0% | 0.00 0.0 1.00 0.23 0.06 right | 2 7291 124890 | 0.0
69 WindDir9am_N float64 0 100.0% | 0.00 0.0 1.00 0.28 0.08 right | 2 10901 121280 | 0.0
70 WindDir9am_NE float64 0 100.0% | 0.00 0.0 1.00 0.22 0.05 right | 2 7033 125148 | 0.0
71 WindDir9am_NNE float64 0 100.0% | 0.00 0.0 1.00 0.23 0.06 right | 2 7682 124499 | 0.0
72 WindDir9am_NNW float64 0 100.0% | 0.00 0.0 1.00 0.23 0.06 right | 2 7450 124731 | 0.0
73 WindDir9am_NW float64 0 100.0% | 0.00 0.0 1.00 0.23 0.06 right | 2 7503 124678 | 0.0
74 WindDir9am_S float64 0 100.0% | 0.00 0.0 1.00 0.24 0.06 right | 2 8121 124060 | 0.0
75 WindDir9am_SE float64 0 100.0% | 0.00 0.0 1.00 0.25 0.06 right | 2 8525 123656 | 0.0
76 WindDir9am_SSE float64 0 100.0% | 0.00 0.0 1.00 0.25 0.07 right | 2 8722 123459 | 0.0
77 WindDir9am_SSW float64 0 100.0% | 0.00 0.0 1.00 0.23 0.05 right | 2 7123 125058 | 0.0
78 WindDir9am_SW float64 0 100.0% | 0.00 0.0 1.00 0.23 0.06 right | 2 7684 124497 | 0.0
79 WindDir9am_W float64 0 100.0% | 0.00 0.0 1.00 0.23 0.06 right | 2 7496 124685 | 0.0
80 WindDir9am_WNW float64 0 100.0% | 0.00 0.0 1.00 0.22 0.05 right | 2 6817 125364 | 0.0
81 WindDir9am_WSW float64 0 100.0% | 0.00 0.0 1.00 0.22 0.05 right | 2 6507 125674 | 0.0
82 WindDir9am_nan float64 0 100.0% | 0.00 0.0 1.00 0.23 0.05 right | 2 7186 124995 | 0.0
83 WindDir3pm_E float64 0 100.0% | 0.00 0.0 1.00 0.24 0.06 right | 2 7759 124422 | 0.0
84 WindDir3pm_ENE float64 0 100.0% | 0.00 0.0 1.00 0.23 0.06 right | 2 7471 124710 | 0.0
85 WindDir3pm_ESE float64 0 100.0% | 0.00 0.0 1.00 0.24 0.06 right | 2 7847 124334 | 0.0
86 WindDir3pm_N float64 0 100.0% | 0.00 0.0 1.00 0.24 0.06 right | 2 8427 123754 | 1.0
87 WindDir3pm_NE float64 0 100.0% | 0.00 0.0 1.00 0.23 0.06 right | 2 7719 124462 | 0.0
88 WindDir3pm_NNE float64 0 100.0% | 0.00 0.0 1.00 0.21 0.05 right | 2 6318 125863 | 0.0
89 WindDir3pm_NNW float64 0 100.0% | 0.00 0.0 1.00 0.23 0.06 right | 2 7477 124704 | 0.0
90 WindDir3pm_NW float64 0 100.0% | 0.00 0.0 1.00 0.24 0.06 right | 2 7838 124343 | 0.0
91 WindDir3pm_S float64 0 100.0% | 0.00 0.0 1.00 0.25 0.07 right | 2 9209 122972 | 0.0
92 WindDir3pm_SE float64 0 100.0% | 0.00 0.0 1.00 0.26 0.07 right | 2 9391 122790 | 0.0
93 WindDir3pm_SSE float64 0 100.0% | 0.00 0.0 1.00 0.25 0.07 right | 2 8844 123337 | 0.0
94 WindDir3pm_SSW float64 0 100.0% | 0.00 0.0 1.00 0.23 0.06 right | 2 7728 124453 | 1.0
95 WindDir3pm_SW float64 0 100.0% | 0.00 0.0 1.00 0.25 0.07 right | 2 8680 123501 | 1.0
96 WindDir3pm_W float64 0 100.0% | 0.00 0.0 1.00 0.26 0.07 right | 2 9498 122683 | 0.0
97 WindDir3pm_WNW float64 0 100.0% | 0.00 0.0 1.00 0.24 0.06 right | 2 8413 123768 | 0.0
98 WindDir3pm_WSW float64 0 100.0% | 0.00 0.0 1.00 0.25 0.07 right | 2 8861 123320 | 0.0
99 WindDir3pm_nan float64 0 100.0% | 0.00 0.0 1.00 0.07 0.01 right | 2 701 131480 | 0.0
100 RainToday_No float64 0 100.0% | 0.00 1.0 1.00 0.42 0.78 left | 2 29267 102914 | 1.0
101 RainToday_Yes float64 0 100.0% | 0.00 0.0 1.00 0.42 0.22 right | 2 29267 102914 | 1.0
102 RainToday_nan float64 0 100.0% | 0.00 0.0 0.00 0.00 0.00 equal | 1 132181 132181 | 0.0
103 RainTomorrow_No float64 0 100.0% | 0.00 1.0 1.00 0.42 0.78 left | 2 29575 102606 | 1.0
104 RainTomorrow_Yes float64 0 100.0% | 0.00 0.0 1.00 0.41 0.22 right | 2 28730 103451 | 0.0
105 RainTomorrow_nan float64 0 100.0% | 0.00 0.0 1.00 0.08 0.01 right | 2 845 131336 | 0.0
106 Weekday_Friday float64 0 100.0% | 0.00 0.0 1.00 0.35 0.14 right | 2 18810 113371 | 0.0
107 Weekday_Monday float64 0 100.0% | 0.00 0.0 1.00 0.35 0.14 right | 2 19045 113136 | 0.0
108 Weekday_Saturday float64 0 100.0% | 0.00 0.0 1.00 0.35 0.14 right | 2 18557 113624 | 1.0
109 Weekday_Sunday float64 0 100.0% | 0.00 0.0 1.00 0.35 0.14 right | 2 18779 113402 | 0.0
110 Weekday_Thursday float64 0 100.0% | 0.00 0.0 1.00 0.35 0.14 right | 2 18972 113209 | 0.0
111 Weekday_Tuesday float64 0 100.0% | 0.00 0.0 1.00 0.35 0.14 right | 2 19025 113156 | 0.0
112 Weekday_Wednesday float64 0 100.0% | 0.00 0.0 1.00 0.35 0.14 right | 2 18993 113188 | 1.0
113 Month_April float64 0 100.0% | 0.00 0.0 1.00 0.27 0.08 right | 2 10565 121616 | 0.0
114 Month_August float64 0 100.0% | 0.00 0.0 1.00 0.28 0.08 right | 2 10912 121269 | 0.0
115 Month_December float64 0 100.0% | 0.00 0.0 1.00 0.26 0.08 right | 2 10023 122158 | 0.0
116 Month_February float64 0 100.0% | 0.00 0.0 1.00 0.26 0.07 right | 2 9743 122438 | 0.0
117 Month_January float64 0 100.0% | 0.00 0.0 1.00 0.29 0.09 right | 2 11979 120202 | 0.0
118 Month_July float64 0 100.0% | 0.00 0.0 1.00 0.28 0.08 right | 2 10968 121213 | 0.0
119 Month_June float64 0 100.0% | 0.00 0.0 1.00 0.28 0.09 right | 2 11542 120639 | 0.0
120 Month_March float64 0 100.0% | 0.00 0.0 1.00 0.29 0.09 right | 2 12134 120047 | 0.0
121 Month_May float64 0 100.0% | 0.00 0.0 1.00 0.29 0.09 right | 2 12168 120013 | 0.0
122 Month_November float64 0 100.0% | 0.00 0.0 1.00 0.27 0.08 right | 2 10671 121510 | 0.0
123 Month_October float64 0 100.0% | 0.00 0.0 1.00 0.28 0.08 right | 2 11001 121180 | 0.0
124 Month_September float64 0 100.0% | 0.00 0.0 1.00 0.27 0.08 right | 2 10475 121706 | 0.0
125 MinTemp float64 0 100.0% | -8.50 11.9 33.90 6.47 12.15 right | 489 1 819 | 4.0
126 MaxTemp float64 0 100.0% | -4.80 22.8 48.10 7.20 23.31 right | 570 1 755 | 24.4
127 Rainfall float64 0 100.0% | 0.00 0.0 367.60 8.40 2.33 right | 653 1 85032 | 0.4
128 WindGustSpeed float64 0 100.0% | 6.00 39.0 135.00 13.58 39.98 right | 67 1 9019 | 30.0
129 WindSpeed9am float64 0 100.0% | 0.00 13.0 87.00 8.77 14.19 right | 40 1 12503 | 9.0
130 WindSpeed3pm float64 0 100.0% | 0.00 19.0 87.00 8.67 18.75 left | 42 1 11937 | 17.0
131 Humidity9am float64 0 100.0% | -20.90 70.0 112.82 19.38 68.59 left | 1268 1 3239 | 65.0
132 Humidity3pm float64 0 100.0% | -14.43 51.0 104.33 20.91 50.99 left | 1764 1 2555 | 45.0
133 Pressure9am float64 0 100.0% | 980.50 1018.3 1041.00 8.37 1019.00 right | 9279 1 773 | 1027.2
134 Pressure3pm float64 0 100.0% | 977.10 1015.9 1039.60 8.25 1016.56 right | 9034 1 741 | 1018.9
135 Temp9am float64 0 100.0% | -7.20 16.7 40.20 6.58 16.98 right | 795 1 811 | 10.5
136 Temp3pm float64 0 100.0% | -5.40 21.3 46.70 7.06 21.81 right | 1325 1 784 | 23.1
137 Day int64 0 100.0% | 1.00 16.0 31.00 8.80 15.72 left | 31 2579 4369 | 2.0
138 Hour int64 0 100.0% | 0.00 0.0 0.00 0.00 0.00 equal | 1 132181 132181 | 0.0
139 Minute int64 0 100.0% | 0.00 0.0 0.00 0.00 0.00 equal | 1 132181 132181 | 0.0